Code
library(tidyverse)
library(magrittr)
library(patchwork)
library(viridis)
library(gt)
tmp <- fs::dir_map("R", source)
tmp <- fs::dir_map("../R", source)
theme_set(theme_publication())library(tidyverse)
library(magrittr)
library(patchwork)
library(viridis)
library(gt)
tmp <- fs::dir_map("R", source)
tmp <- fs::dir_map("../R", source)
theme_set(theme_publication())mkdir -p lactinv_dropbox
rclone mount lactinv-dropbox:"/Gates LACTIN-V/" "lactinv_dropbox/"mae <- load_latest_mae(dir = str_c(data_dir(), "02_mae/"))Loading mae_20250704.RDS
raw_mae <- maeIn this document, we augment the MAE (MultiAssay Experiment) object with additional assays derived from the raw data.
colData(mae)For downstream analyses, it is convenient to add variables such as has_V0, has_V1, or has_V0_and_V1 to flag participants for which samples at Visit 0 (pre-MTZ) and/or Visit 1 (post-MTZ) are available.
clin <- MultiAssayExperiment::colData(mae) |> as.data.frame()
clin <-
clin |>
group_by(USUBJID) |>
mutate(
has_V0 = any(AVISITN == 0),
has_V1 = any(AVISITN == 1),
has_V0_and_V1 = has_V0 & has_V1
) |>
ungroup()
MultiAssayExperiment::colData(mae)$has_V0 <- clin$has_V0
MultiAssayExperiment::colData(mae)$has_V1 <- clin$has_V1
MultiAssayExperiment::colData(mae)$has_V0_and_V1 <- clin$has_V0_and_V1filter_criteria <-
list(min_tot_count = 50, n_samples = 2, min_counts_per_sample = 5)Sometimes, it is convenient to work with a “lighter” ASV table. To create that lighter table, we keep ASVs that have at least 5 counts in at least 2 samples and at least 50 counts across all samples.
non_zero_ASVs <-
filter_out_low_count_ASVs(
mae, assay_name = 'ASV_16S',
min_tot_count = filter_criteria$min_tot_count,
min_n_samples = filter_criteria$n_samples,
min_counts_per_sample = filter_criteria$min_counts_per_sample
)
non_zero_ASVs$plotWarning in scale_y_log10(str_c("number of samples with at least ",
min_counts_per_sample, : log-10 transformation introduced infinite values.
mae <- c(mae, non_zero_ASVs$SE)# statistics of what that does
n_all_ASV <-
MultiAssayExperiment::assay(mae,"ASV_16S") |> nrow()
n_non_zero_ASVs <-
SummarizedExperiment::assay(non_zero_ASVs$SE$ASV_16S_filtered) |> nrow()
perc_removed <- (100 * (1 - n_non_zero_ASVs/n_all_ASV)) |> round()
counts_all_ASVs <-
MultiAssayExperiment::assay(mae,"ASV_16S") |> sum()
counts_non_zero_ASVs <-
SummarizedExperiment::assay(non_zero_ASVs$SE$ASV_16S_filtered) |> sum()
perc_counts_removed <-
(100 * (1 - counts_non_zero_ASVs/counts_all_ASVs)) |> round(2)That removes 92% of ASVs, but only 0.33% of total counts.
We add three assays with relative abundances:
one with the relative abundance of ASVs
one with the relative abundances of taxa (ASV counts aggregated by taxonomic level)
one with the relative abundances of taxa and where we distinguish between CTV-05 and non-CTV-05 Lactobacillus crispatus
rel_abundances_ASV <-
get_relative_abundance_assay(mae, assay_name = "ASV_16S_filtered")
mae <- c(mae, rel_abundances_ASV)
rel_abundances_tax <- get_relative_abundance_assay(mae, assay_name = "tax_16S")
mae <- c(mae, rel_abundances_tax)ctv05_wide <- get_assay_wide_format(mae, "MG_CTV05")
props_with_ctv05 <-
ctv05_wide |>
select(Barcode, assay) |>
unnest(assay) |>
select(-`Non L. crispatus`) |>
dplyr::rename(
"CTV05" = `CTV-05`,
"non-CTV05 Lactobacillus crispatus" = `non-CTV-05 L. crispatus`,
"undetermined Lactobacillus crispatus" = `Undetermined L. crispatus`
) |>
full_join(
get_assay_wide_format(mae, "tax_16S_p", add_colData = FALSE) |>
unnest(assay) |>
select(-`Lactobacillus crispatus`),
by = join_by(Barcode)
) |>
as.data.frame() |>
column_to_rownames("Barcode") |>
as.matrix()
props_with_ctv05 <-
props_with_ctv05 / rowSums(props_with_ctv05)rowdata <-
SummarizedExperiment::rowData(mae[["tax_16S"]]) |>
as.data.frame()
j <- which(rownames(rowdata) == "Lactobacillus crispatus")
new_rowdata <-
bind_rows(
rowdata[rep(j, 3), ] |>
mutate(
strain = c("CTV-05", "non-CTV-05", "undetermined"),
tax_label = str_c("Lactobacillus crispatus (", strain, ")"),
key = tax_label
) |>
set_rownames(
c(
"CTV05",
"non-CTV05 Lactobacillus crispatus",
"undetermined Lactobacillus crispatus")
),
rowdata[-j,]
)new_assay <- list()
new_assay[["tax_CTV05_p"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = props_with_ctv05 |> t(),
rowData = new_rowdata
)
new_assay$tax_CTV05_p
class: SummarizedExperiment
dim: 222 1176
metadata(0):
assays(1): ''
rownames(222): CTV05 non-CTV05 Lactobacillus crispatus ... Leptotrichia
shahii Lautropia mirabilis
rowData names(13): Domain Phylum ... key strain
colnames(1176): 202302505 202302518 ... 202306743 203323517
colData names(0):
mae <- c(mae, new_assay)We create and add an assay that has the proportion of Lactobacillus in each sample
props <-
get_assay_long_format(mae, "tax_16S_p", add_colData = FALSE)
prop_Lacto <-
props |>
mutate(
category =
case_when(
str_detect(feature, "crispatus") ~ "L_crisp",
str_detect(feature, "iners") ~ "L_iners",
str_detect(feature, "Lactobacillus") ~ "other_Lacto",
TRUE ~ "non_Lacto"
)
) |>
group_by(Barcode, category) |>
summarize(prop = sum(value), .groups = "drop") |>
pivot_wider(id_cols = Barcode, names_from = category, values_from = prop,
names_prefix = "prop_") |>
mutate(
prop_Lacto = prop_other_Lacto + prop_L_crisp + prop_L_iners,
prop_non_iners_Lacto = prop_Lacto - prop_L_iners
) |>
select(Barcode, prop_L_crisp, prop_L_iners, prop_other_Lacto, prop_Lacto, prop_non_iners_Lacto, prop_non_Lacto) |>
as.data.frame()
prop_Lacto <- prop_Lacto |> set_rownames(prop_Lacto$Barcode) |> select(-Barcode)
prop_Lacto_assay <- list()
prop_Lacto_assay[["prop_Lacto"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = prop_Lacto |> t()
)
mae <- c(mae, prop_Lacto_assay)The distribution of Lactobacillus is as follows:
prop_Lacto |>
ggplot(aes(x = prop_Lacto)) +
geom_histogram(binwidth = 0.02) +
scale_x_continuous("Proportion of Lactobacillus", breaks = seq(0, 1, 0.1))The distribution of Lactobacillus crispatus is:
prop_Lacto |>
ggplot(aes(x = prop_L_crisp)) +
geom_histogram(binwidth = 0.02) +
scale_x_continuous("Proportion of L. crispatus", breaks = seq(0, 1, 0.1))And we also add an assay that categorize each target visit sample according to the endpoint definition.
categories_levels <-
c(
"≥ 50% L. crispatus",
"≥ 50% Lactobacillus, < 50% L. crispatus",
"< 50% Lactobacillus"
)
categories <-
get_assay_wide_format(mae, "prop_Lacto") |>
mutate(
category =
case_when(
(AVISITN >= 0) & (assay$prop_L_crisp >= 0.5) ~ categories_levels[1],
(AVISITN >= 0) & (assay$prop_Lacto >= 0.5) ~ categories_levels[2],
(AVISITN >= 0) ~ categories_levels[3],
TRUE ~ NA_character_
) |> factor(levels = categories_levels)
) |>
select(Barcode, category) |>
as.data.frame() |>
column_to_rownames("Barcode")
categories_assay <- list()
categories_assay[["mb_categories"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = categories |> t()
)
mae <- c(mae, categories_assay)The distribution of the microbiota categories is as follow:
ggplot(categories |> filter(!is.na(category)), aes(y = category)) +
geom_bar() +
ylab("") + xlab("Number of samples")categories_wide <-
categories |>
rownames_to_column() |>
mutate(tmp = (!is.na(category)) * 1) |>
pivot_wider(names_from = category, values_from = tmp, values_fill = 0) |>
select(-any_of("NA")) |>
mutate(across(all_of(categories_levels), .f = function(x){x; x[is.na(categories$category)] <- NA; x})) |>
as.data.frame() |>
column_to_rownames() |>
select(all_of(categories_levels))
categories_wide_assay <- list()
categories_wide_assay[["mb_categories_wide"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = categories_wide |> t()
)
mae <- c(mae, categories_wide_assay)We add an assay with the CST and sub-CST assignments (VALENCIA) for each sample.
counts <- MultiAssayExperiment::assay(mae, "tax_16S") |> t()
library(ValenciaR) # devtools::install_github("lasy/ValenciaR")
counts_Valencia <-
ValenciaR::convert_to_Valencia_taxonomy(
input = counts,
tax_table = SummarizedExperiment::rowData(mae[["tax_16S"]]) |> as.data.frame()
)
assignments <-
ValenciaR::assign_to_Valencia_clusters(
input = counts_Valencia$converted_input,
distance = "YC"
)Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : Assuming count data (as opposed to proportions) were provided.
Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : `input` does not have counts/proportions for all taxa represented in Valencia.
Data for 102/199 taxa are missing.
These missing taxa represent around 17% (1-83%) of Valencia cluster composition.
Their list is in the `missing_taxa` component of the returned value.
CST_assay <- list()
CST_assay[["CST"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = assignments$assignment |> t()
)
mae <- c(mae, CST_assay)The distribution of CST/sub-CSTs is as follow:
assignments$assignment |>
ggplot(aes(y = subCST |> fct_rev())) +
geom_bar() +
facet_grid(CST ~ ., scales = "free", space = "free") +
ylab("CST/subCST") +
theme(strip.text.y = element_text(angle = 0))And the distribution of dissimilarities between samples and assigned subCST centroids is as follow:
assignments$assignment |>
ggplot(aes(x = distance_to_subCST)) +
geom_histogram(binwidth = 0.02) +
facet_grid(subCST ~ ., scales = "free") +
stat_summary(
aes(x = 0.1, y = distance_to_subCST, xintercept = stat(y)),
fun.y = median, geom = "vline", col = "red"
) +
theme(strip.text.y = element_text(angle = 0)) +
xlab("Dissimilarity (Yue-Clayton) to subCST centroid")Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
ℹ Please use the `fun` argument instead.
Warning: `stat(y)` was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(y)` instead.
most_prevalent_genus <-
get_assay_long_format(mae, "tax_16S_p", add_colData = FALSE) |>
left_join(
SummarizedExperiment::rowData(mae[["tax_16S_p"]]) |>
as.data.frame() |>
rownames_to_column("feature") |>
as_tibble(),
by = join_by(feature)
) |>
group_by(Barcode, Genus) |>
summarize(rel_ab = sum(value), .groups = "drop") |>
arrange(Barcode, -rel_ab) |>
group_by(Barcode) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::rename(most_prev_genus = Genus) |>
arrange(-rel_ab) |>
mutate(most_prev_genus = most_prev_genus |> str_replace("Candidatus Lachnocurva", "Ca. Lachn. (BVAB1)") |> fct_inorder()) |>
arrange(Barcode) |>
select(-rel_ab)assignments$assignment |>
rownames_to_column("Barcode") |>
left_join(
most_prevalent_genus,
by = join_by(Barcode)
) |>
ggplot(aes(x = distance_to_subCST)) +
geom_histogram(binwidth = 0.02) +
stat_summary(
aes(x = 0.1, y = distance_to_subCST, xintercept = stat(y)),
fun.y = median, geom = "vline", col = "red"
) +
facet_grid(most_prev_genus ~ ., scales = "free") +
theme(strip.text.y = element_text(angle = 0)) +
xlab("Dissimilarity (Yue-Clayton) to subCST centroid")most_prevalent_genus |>
dplyr::count(most_prev_genus) |>
mutate(`%` = (100 * n/sum(n)) |> round()) |>
gt(caption = "Number and % of sample per most prevalent genus")| most_prev_genus | n | % |
|---|---|---|
| Lactobacillus | 644 | 55 |
| Gardnerella | 245 | 21 |
| Prevotella | 183 | 16 |
| Streptococcus | 2 | 0 |
| Ca. Lachn. (BVAB1) | 75 | 6 |
| Sneathia | 10 | 1 |
| Mobiluncus | 2 | 0 |
| Fannyhessea | 10 | 1 |
| Haemophilus | 1 | 0 |
| Actinomyces | 1 | 0 |
| Gemella | 1 | 0 |
assignments_BC <-
ValenciaR::assign_to_Valencia_clusters(
input = counts_Valencia$converted_input,
distance = "BC"
)Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : Assuming count data (as opposed to proportions) were provided.
Warning in ValenciaR::assign_to_Valencia_clusters(input = counts_Valencia$converted_input, : `input` does not have counts/proportions for all taxa represented in Valencia.
Data for 102/199 taxa are missing.
These missing taxa represent around 17% (1-83%) of Valencia cluster composition.
Their list is in the `missing_taxa` component of the returned value.
assignments_BC$assignment |>
ggplot(aes(x = distance_to_subCST)) +
geom_histogram(binwidth = 0.02) +
facet_grid(subCST ~ ., scales = "free") +
stat_summary(
aes(x = 0.1, y = distance_to_subCST, xintercept = stat(y)),
fun.y = median, geom = "vline", col = "red"
) +
theme(strip.text.y = element_text(angle = 0)) +
xlab("Dissimilarity (Bray-Curtis) to subCST centroid")assignments_BC$assignment |>
rownames_to_column("Barcode") |>
left_join(
most_prevalent_genus,
by = join_by(Barcode)
) |>
ggplot() +
aes(x = distance_to_subCST) +
facet_grid(most_prev_genus ~ ., scales = "free") +
geom_histogram(aes(y = ..density..), binwidth = 0.02) +
stat_summary(
aes(x = 0.1, y = distance_to_subCST, xintercept = stat(y)),
fun.y = median, geom = "vline", col = "red"
) +
theme(strip.text.y = element_text(angle = 0)) +
xlab("Dissimilarity (Bray-Curtis) to subCST centroid")Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.
We note that for many non-Lactobacillus-dominated subCSTs, the dissimilarity to the centroid is quite high, which suggests that these subCSTs are not well defined in this cohort.
assignments_BC$assignment |>
mutate(
`BC dissimilarity to centroid` = ifelse(distance_to_subCST > 0.5, "> 0.5", "≤ 0.5")
) |>
dplyr::count(`BC dissimilarity to centroid`) |>
mutate(`%` = (100 * n/sum(n)) |> round()) |>
gt()| BC dissimilarity to centroid | n | % |
|---|---|---|
| > 0.5 | 188 | 16 |
| ≤ 0.5 | 986 | 84 |
The median BC dissimilarity to centroids is 0.34.
tmp <-
assignments$distances |>
as.data.frame() |>
rownames_to_column("Barcode") |>
pivot_longer(-Barcode, names_to = "subCST", values_to = "distance") |>
group_by(Barcode) |>
arrange(Barcode, distance) |>
slice_head(n = 2) |>
summarize(
diff_dist = distance[2] - distance[1],
distance_with_next_closest_subCST = distance[2],
.groups = "drop"
) |>
left_join(
assignments$assignment |> rownames_to_column("Barcode"), by = join_by(Barcode)
) |>
group_by(subCST) |>
mutate(goodness_of_assignment = (diff_dist - 2 * distance_to_subCST)/max(distance_to_subCST)) |>
ungroup()
tmp |>
ggplot() +
aes(x = distance_to_subCST, y = distance_with_next_closest_subCST, col = goodness_of_assignment) +
facet_wrap(subCST ~ .) +
geom_abline(intercept = 0, slope = 1) +
geom_abline(intercept = 0.05, slope = 1.2, linetype = 3, col = "red", alpha = 0.5) +
geom_point(alpha = 0.5) +
scale_color_gradient("Goodness of\nsubCST assignment", low = "red", high = "steelblue1") +
xlab("YC dissimilarity to closest subCST") +
ylab("YC dissimilarity to next closest subCST") tmp <-
assignments_BC$distances |>
as.data.frame() |>
rownames_to_column("Barcode") |>
pivot_longer(-Barcode, names_to = "subCST", values_to = "distance") |>
group_by(Barcode) |>
arrange(Barcode, distance) |>
slice_head(n = 2) |>
summarize(
diff_dist = distance[2] - distance[1],
distance_with_next_closest_subCST = distance[2],
.groups = "drop"
) |>
left_join(
assignments_BC$assignment |> rownames_to_column("Barcode"), by = join_by(Barcode)
) |>
group_by(subCST) |>
mutate(goodness_of_assignment = (diff_dist - 2 * distance_to_subCST)/max(distance_to_subCST)) |>
ungroup()
tmp |>
ggplot() +
aes(x = distance_to_subCST, y = distance_with_next_closest_subCST, col = goodness_of_assignment) +
facet_wrap(subCST ~ .) +
geom_abline(intercept = 0, slope = 1) +
geom_abline(intercept = 0.05, slope = 1.2, linetype = 3, col = "red", alpha = 0.5) +
geom_point(alpha = 0.5) +
scale_color_gradient("Goodness of\nsubCST assignment", low = "red", high = "steelblue1") +
xlab("BC dissimilarity to closest subCST") +
ylab("BC dissimilarity to next closest subCST") tmp <-
tmp |>
mutate(
`Distance to centroids` =
ifelse(
distance_with_next_closest_subCST < (0.05 + 1.2 * distance_to_subCST),
"Almost equidistant to two CTS/subCSTs",
"Closer to assigned CST/subCST"
)
)
tmp |>
dplyr::count(`Distance to centroids`) |>
mutate(`%` = (100 * n/sum(n)) |> round()) |>
gt::gt()| Distance to centroids | n | % |
|---|---|---|
| Almost equidistant to two CTS/subCSTs | 464 | 40 |
| Closer to assigned CST/subCST | 710 | 60 |
We also note that, in addition to being very far from their respective subCST centroids, about 40% of samples are almost equally close to their assigned subCST than to the next closest subCST (below the dashed line). This proportion varies widely by CST/subCSTs:
tmp |>
group_by(subCST) |>
dplyr::count(`Distance to centroids`) |>
mutate(`%` = (100 * n/sum(n)) |> round()) |>
gt::gt(row_group_as_column = TRUE)| Distance to centroids | n | % | |
|---|---|---|---|
| I-A | Almost equidistant to two CTS/subCSTs | 8 | 5 |
| Closer to assigned CST/subCST | 168 | 95 | |
| I-B | Almost equidistant to two CTS/subCSTs | 45 | 90 |
| Closer to assigned CST/subCST | 5 | 10 | |
| II | Almost equidistant to two CTS/subCSTs | 1 | 100 |
| III-A | Almost equidistant to two CTS/subCSTs | 35 | 19 |
| Closer to assigned CST/subCST | 154 | 81 | |
| III-B | Almost equidistant to two CTS/subCSTs | 107 | 69 |
| Closer to assigned CST/subCST | 48 | 31 | |
| IV-A | Almost equidistant to two CTS/subCSTs | 66 | 37 |
| Closer to assigned CST/subCST | 114 | 63 | |
| IV-B | Almost equidistant to two CTS/subCSTs | 190 | 50 |
| Closer to assigned CST/subCST | 189 | 50 | |
| IV-C0 | Almost equidistant to two CTS/subCSTs | 4 | 100 |
| IV-C1 | Almost equidistant to two CTS/subCSTs | 1 | 33 |
| Closer to assigned CST/subCST | 2 | 67 | |
| V | Almost equidistant to two CTS/subCSTs | 7 | 19 |
| Closer to assigned CST/subCST | 30 | 81 |
For the non-Lactobacillus dominated samples (i.e., samples in which the proportion of Lactobacillus is < 80%), that proportion is even higher.
tmp |>
left_join(
get_assay_long_format(mae, "prop_Lacto", add_colData = FALSE) |>
filter(feature == "prop_Lacto") |>
dplyr::rename(prop_Lacto = value) |>
select(Barcode, prop_Lacto),
by = join_by(Barcode)
) |>
mutate(
Lacto_group = ifelse(prop_Lacto >= 0.5, "≥ 50% Lactobacillus", "< 50% Lactobacillus")
) |>
group_by(Lacto_group) |>
dplyr::count(`Distance to centroids`) |>
mutate(`%` = (100 * n/sum(n)) |> round()) |>
gt::gt(row_group_as_column = TRUE)| Distance to centroids | n | % | |
|---|---|---|---|
| < 50% Lactobacillus | Almost equidistant to two CTS/subCSTs | 281 | 48 |
| Closer to assigned CST/subCST | 307 | 52 | |
| ≥ 50% Lactobacillus | Almost equidistant to two CTS/subCSTs | 183 | 31 |
| Closer to assigned CST/subCST | 403 | 69 |
Analyses above show that many samples do not fit well into the CST/subCST framework.
An alternative to classifying samples so single CST/subCST is to use topic models, which are mixed membership models that allow each sample to be a mixture of subcommunities (a mixture of several topics) (see Sankaran and Holmes, 2019 and Symul et al., 2023).
We fit the topics de novo, that is we do not use “reference” subcommunities, but rather identify these subcommunities from the data.
We do this using the alto package, which is a wrapper around the lda package, and which allows to examine how subcommunities relate to each other across models with different number of subcommunities.
So, we first fit the topic models for different number of subcommunities (K), and then we label the sub-communities according to their taxonomic composition and such that their label match their most similar subCST.
counts <- MultiAssayExperiment::assay(mae, "tax_16S") |> t()
max_K <- 15We fit the topic models for K in 1 to 15.
library(alto) # devtools::install_github("lasy/alto")
lda_models <-
run_lda_models(
data = counts,
lda_varying_params_lists =
setNames(purrr::map(1:max_K, ~ list(k = .)), 1:max_K),
dir = str_c(data_dir(),"/03_augmented_mae/lda_models/"),
reset = FALSE,
verbose = TRUE
)
valencia_centroids_mat <-
ValenciaR::get_Valencia_clusters()
labelled_lda_models <-
label_models_topics(
lda_models = lda_models,
tax_table =
SummarizedExperiment::rowData(mae[["tax_16S"]]) |> as.data.frame(),
valencia_centroids_mat = valencia_centroids_mat,
distance = "YC"
)The alignments of topics look like this:
aligned_topics_transport <-
align_topics(models = labelled_lda_models, method = "transport")
plot(aligned_topics_transport) +
labs(title = '"Transport" alignment',
subtitle = "(= based on the similarities of sub-community compositions)",
xlab = "Number of sub-communities (topics)")aligned_topics_product <-
align_topics(models = labelled_lda_models, method = "product")
plot(aligned_topics_product) +
labs(title = '"Product" alignment',
subtitle = "(= based on the similarities of sample compositions)",
xlab = "Number of sub-communities (topics)")One of the metrics used to identify an optimal number of topic is the number of path identified throughout K.
topics <-
bind_rows(
aligned_topics_transport@topics |> mutate(method = "transport"),
aligned_topics_product@topics |> mutate(method = "product")
)
plot_number_of_paths(compute_number_of_paths(aligned_topics_transport)) +
ggtitle("Transport") +
plot_number_of_paths(compute_number_of_paths(aligned_topics_product)) +
ggtitle("Product")This suggests that there might be 7 or 9 true sub-communities.
We can also examine the distribution of coherence and refinement scores:
ggplot(topics, aes(x = m, y = coherence, fill = method, col = method)) +
geom_boxplot(alpha = 0.5) +
ggplot(topics, aes(x = m, y = refinement, fill = method, col = method)) +
geom_abline(slope = 1, intercept = 0) +
geom_boxplot(alpha = 0.5) +
plot_layout(guides = "collect")Warning: Removed 30 rows containing non-finite outside the scale range
(`stat_boxplot()`).
We see a first drop around K = 4-5, and a second drop at K = 8-9.
We can check the sub-communities composition for a selection of Ks:
plot_beta(aligned_topics_transport, models = c(5, 6, 7, 9)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))From these plots, it looks like K = 7 might be a good compromise between dimension reduction, representation accuracy, and minimizing the number of spurious topics.
We note that at none of these resolutions (K = 5, 6, 7, 9) do we find any topic similar to the subCSTs IV-C(0-4).
We store the sample topic-mixtures and the topic composition as a SE in the MAE
optimal_K <- 7
topic_SE <-
make_topic_SE(
assay_name = str_c("topics_16S_", optimal_K),
lda_model = labelled_lda_models[[optimal_K]]
)
mae <- c(mae, topic_SE)One of the problem with the topic models fitted above is that it may happen that non-Lacto species are mixed with Lacto species in the topic models. This may reflect actual biology, which would be interesting to study, but this may also simply correspond to a data-set specific “statistical optimum” that does not have a biological interpretation.
The main “issue” is with the topic V dominated by L. jensenii:
beta_long <- get_beta_long(mae, assayname = "topics_16S_7")
beta_long |>
filter(topic_subcst_matching_label %in% c("I","III","V")) |>
filter(prop > 1/500) |>
arrange(topic, -prop) |>
mutate(taxa = taxa |> factor(levels = unique(taxa))) |>
ggplot() +
aes(y = taxa |> fct_rev(), x = prop, fill = topic) +
geom_col() +
facet_grid(. ~ topic_label) +
guides(fill = "none") +
ylab("") + xlab("proportion in topic") +
scale_fill_viridis(
discrete = TRUE, option = "A", direction = -1, end = 0.85, begin = 0.2
)One potential simple solution is to constraint “Lactobacillus topics” to only contain Lactobacillus species and fit the topic model on the remaining non-Lactobacillus species.
counts <- MultiAssayExperiment::assay(mae, "tax_16S") |> t()
props <- MultiAssayExperiment::assay(mae, "tax_16S_p") |> t()
props_long <-
get_assay_long_format(
mae, "tax_16S_p", add_colData = FALSE,
feature_name = "taxa", values_name = "prop")props_long_lacto <-
props_long |>
filter(str_detect(taxa, "Lactobacillus")) |>
group_by(taxa) |>
mutate(median_prop = median(prop)) |>
ungroup() |>
arrange(-median_prop) |>
mutate(
taxa = taxa |> str_replace("Lactobacillus", "L."),
taxa = taxa |> factor(levels = unique(taxa))
)
# props_long_lacto |>
# ggplot(aes(x = taxa, y = prop)) +
# geom_boxplot(outlier.alpha = 0.25, outlier.size = 0.5, fill = "antiquewhite3", col = "antiquewhite4") +
# xlab("Taxa")
g <-
props_long_lacto |>
ggplot(aes(y = taxa |> fct_rev(), x = prop, col = taxa)) +
geom_jitter(width = 0, height = 0.15, size = 0.5) +
xlab("Proportion in samples") + ylab("") +
scale_color_manual(values = get_topic_colors(c("III","I", "V", rep("VI", 7)))) +
guides(col = "none")
gggsave(g, filename = str_c(fig_out_dir(), "S2C.pdf"), width = 10, height = 3, device = cairo_pdf)Among all Lactobacillus species, there are only 3 making more than 50% of a sample in at least 10 samples: Lactobacillus iners, crispatus, and jensenii.
So, we’ll create 4 Lactobacillus topics:
topic I : 100% L. crispatus
topic II : 100% L. iners
topic V: 100% L. jensenii
topic VI: mix of the remaining Lactobacillus species
Lacto_topics_beta <-
matrix(0, nrow = ncol(counts), ncol = 4) |>
set_colnames(c("I","III","V","VI")) |>
set_rownames(colnames(counts))
Lacto_topics_beta["Lactobacillus crispatus","I"] <- 1
Lacto_topics_beta["Lactobacillus iners","III"] <- 1
Lacto_topics_beta["Lactobacillus jensenii","V"] <- 1
all_Lacto_species <- str_subset(colnames(props), "Lactobacillus")
other_Lacto_species <-
setdiff(
all_Lacto_species,
str_c("Lactobacillus ", c("crispatus","iners","jensenii"))
)
Lacto_topics_beta[other_Lacto_species,"VI"] <-
colSums(props[,other_Lacto_species])/sum(props[,other_Lacto_species])Lacto_topics_beta |>
as.data.frame() |>
rownames_to_column("Taxa") |>
pivot_longer(-Taxa, names_to = "topic", values_to = "prop") |>
filter(prop > 0) |>
arrange(topic, -prop) |>
mutate(Taxa = Taxa |> factor(levels = unique(Taxa))) |>
ggplot(aes(y = Taxa |> fct_rev(), x = topic)) +
geom_tile(aes(alpha = prop), fill = "#F56172") +
geom_text(aes(label = round(prop,2)), size = 4) +
ylab("") + xlab("Topic") Lacto_topics_gamma <-
cbind(
props[, str_c("Lactobacillus ", c("crispatus","iners","jensenii"))],
rowSums(props[, other_Lacto_species])
) |>
set_colnames(c("I","III","V","VI"))The distribution of these topics in samples is as follow:
Lacto_topics_gamma |>
as.data.frame() |>
rownames_to_column("sample") |>
pivot_longer(-sample, names_to = "topic", values_to = "prop") |>
ggplot(aes(x = prop)) +
geom_histogram(bins = 50) +
facet_grid(topic ~ ., labeller = label_both) +
theme(strip.text.y = element_text(angle = 0))We now fit the topic models for all non-Lactobacillus species. We do this by fitting topic models on the counts of non-Lactobacillus species for K = 1 to 15.
# we remove the Lacto species
non_Lacto_counts <- counts[, !(colnames(counts) %in% all_Lacto_species)]
# we remove the samples that only have Lacto species
non_Lacto_counts <- non_Lacto_counts[rowSums(non_Lacto_counts) != 0, ]
max_K <- 15
library(alto) # devtools::install_github("lasy/alto")
lda_models <-
run_lda_models(
data = non_Lacto_counts,
lda_varying_params_lists =
setNames(purrr::map(1:max_K, ~ list(k = .)), 1:max_K),
dir = str_c(data_dir(), "03_augmented_mae/lda_models_nonLacto/"),
reset = FALSE,
verbose = TRUE
)
tax_table <-
SummarizedExperiment::rowData(mae[["tax_16S"]]) |>
as.data.frame()
tax_table <- tax_table[colnames(non_Lacto_counts), ]
labelled_lda_models <-
label_models_topics(
lda_models = lda_models,
tax_table = tax_table,
valencia_centroids_mat = ValenciaR::get_Valencia_clusters(),
distance = "YC"
)Just as we did for the topic models above, we can compute the alignment of topics across K and plot the coherence and refinement scores of topics as a function of K.
aligned_topics_transport <-
align_topics(models = labelled_lda_models, method = "transport")
plot(aligned_topics_transport) +
labs(title = '"Transport" alignment',
subtitle = "(= based on the similarities of sub-community compositions)",
xlab = "Number of sub-communities (topics)")aligned_topics_product <-
align_topics(models = labelled_lda_models, method = "product")
plot(aligned_topics_product) +
labs(title = '"Product" alignment',
subtitle = "(= based on the similarities of sample compositions)",
xlab = "Number of sub-communities (topics)")To identify the optimal K, we display the diagnostics scores (number of path, refinement, coherence) across resolution.
topics <-
bind_rows(
aligned_topics_transport@topics |> mutate(method = "transport"),
aligned_topics_product@topics |> mutate(method = "product")
)
plot_number_of_paths(compute_number_of_paths(aligned_topics_transport)) +
ggtitle("Transport") +
plot_number_of_paths(compute_number_of_paths(aligned_topics_product)) +
ggtitle("Product")g <-
plot(aligned_topics_transport) +
labs(
title = 'Topic alignment across K',
subtitle = '"Transport" alignment, based on topic similarities'
) +
xlab("K: Number of topics") +
plot_number_of_paths(compute_number_of_paths(aligned_topics_product)) +
labs(
title = 'Number of path across K',
subtitle = 'Alignment based on sample similarities'
) +
xlab("K") +
theme_publication() +
plot_layout(widths = c(1.4, 1)) +
plot_annotation(tag_levels = "A")
gggsave(g, filename = str_c(fig_out_dir(), "S2AB.pdf"), width = 13, height = 6, device = cairo_pdf)ggplot(topics, aes(x = m, y = coherence, fill = method, col = method)) +
geom_boxplot(alpha = 0.5) +
ggplot(topics, aes(x = m, y = refinement, fill = method, col = method)) +
geom_abline(slope = 1, intercept = 0) +
geom_boxplot(alpha = 0.5) +
plot_layout(guides = "collect")Warning: Removed 30 rows containing non-finite outside the scale range
(`stat_boxplot()`).
The composition of topics at various resolutions can be visualized as follows:
plot_beta(aligned_topics_transport, models = c(4, 5, 6, 11)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) From these visualizations, it looks like K = 4 or K = 5 would be good choices for the number of non-Lactobacillus topics.
We will thus save two topic models: one with K = 4 and one with K = 5, which we both combine with the “Lactobacillus topics” defined above. We then store these two models as a SE in the MAE.
K = 4
K = 4 is a good choice because it is a stable number of path across many K when relying on the product alignment.
combined_topics_K4 <-
combine_topics(4, labelled_lda_models, Lacto_topics_gamma, Lacto_topics_beta)
topic_SE <-
make_topic_SE(
assay_name = str_c("c_topics_16S_", nrow(combined_topics_K4$beta)),
lda_model = combined_topics_K4
)
mae <- c(mae, topic_SE)plot_topic_betas(mae, "c_topics_16S_8") +
xlab("topic number")K = 5
K = 5 might also a good choice because the transport alignment suggests that these 5 topics are quite well preserved at higher resolutions. It also highlights a little better the co-occurence of Fannyhessea vaginae (prev. Atopobium vaginae) and G. swidsinskii/leopoldii and/or G. vaginalis (but not with G. piotii), which has been observed in previous cohorts (see Symul et al, 2023).
combined_topics_K5 <-
combine_topics(5, labelled_lda_models, Lacto_topics_gamma, Lacto_topics_beta)
topic_SE <-
make_topic_SE(
assay_name = str_c("c_topics_16S_", nrow(combined_topics_K5$beta)),
lda_model = combined_topics_K5
)
mae <- c(mae, topic_SE)plot_topic_betas(mae, "c_topics_16S_9")taxa_props <- get_assay_wide_format(mae, "tax_16S_p")
taxa_props |>
select(Barcode, assay) |>
unnest(cols = c(assay)) |>
select(Barcode, contains("Gardnerella") | contains("Fannyhessea")) |>
pivot_longer(cols = contains("Gardnerella"),
names_to = "Gardnerella", values_to = "prop") |>
group_by(Gardnerella) |>
mutate(cor = cor(prop, `Fannyhessea vaginae`)) |>
ungroup() |>
ggplot(aes(x = `Fannyhessea vaginae`, y = prop, col = Gardnerella)) +
geom_point(size = 0.5, alpha = 0.5) +
facet_grid(. ~ Gardnerella) +
guides(col = "none")However, we note here that the proportions of G. piotii are very low in this cohort so we do not observe strong mutual exclusions with Fannyhessea.
Since both models (K = 4 or K = 5) are saved, we can use either of them in downstream analyses.
We divide topic I defined above into two topics: one 100% composed of the CTV-05 strain and one 100% composed of the other Lactobacillus crispatus strains.
Topic I.a: CTV-05
Topic I.b: non-CTV-05 Lactobacillus crispatus
Topic I.c: undetermined Lactobacillus crispatus
Lacto_topics_beta <-
matrix(0, nrow = ncol(counts) - 1, ncol = 6) |>
set_rownames(colnames(counts) |> str_subset("crispatus", negate = TRUE))
Lacto_topics_beta <-
matrix(0, nrow = 3, ncol = 6) |>
set_rownames(str_c("Lactobacillus crispatus (", c("CTV05", "non-CTV05", "?"), ")")) |>
rbind(Lacto_topics_beta) |>
set_colnames(c("I.a", "I.b", "I.c","III","V","VI"))
Lacto_topics_beta["Lactobacillus crispatus (CTV05)","I.a"] <- 1
Lacto_topics_beta["Lactobacillus crispatus (non-CTV05)","I.b"] <- 1
Lacto_topics_beta["Lactobacillus crispatus (?)","I.c"] <- 1
Lacto_topics_beta["Lactobacillus iners","III"] <- 1
Lacto_topics_beta["Lactobacillus jensenii","V"] <- 1
all_Lacto_species <- str_subset(colnames(props), "Lactobacillus")
other_Lacto_species <-
setdiff(
all_Lacto_species,
str_c("Lactobacillus ", c("crispatus","iners","jensenii"))
)
Lacto_topics_beta[other_Lacto_species,"VI"] <-
colSums(props[,other_Lacto_species])/sum(props[,other_Lacto_species])Lacto_topics_beta |>
as.data.frame() |>
rownames_to_column("Taxa") |>
pivot_longer(-Taxa, names_to = "topic", values_to = "prop") |>
filter(prop > 0) |>
arrange(topic, -prop) |>
mutate(Taxa = Taxa |> factor(levels = unique(Taxa))) |>
ggplot(aes(y = Taxa |> fct_rev(), x = topic)) +
geom_tile(aes(alpha = prop), fill = "#F56172") +
geom_text(aes(label = round(prop,2)), size = 4) +
ylab("") + xlab("Topic") ctv05_wide <- get_assay_wide_format(mae, "MG_CTV05")
j <- which(colnames(props) == "Lactobacillus crispatus")
props_with_ctv05 <-
ctv05_wide |>
select(Barcode, assay) |>
unnest(assay) |>
select(-`Non L. crispatus`) |>
dplyr::rename(
"Lactobacillus crispatus (CTV05)" = `CTV-05`,
"Lactobacillus crispatus (non-CTV05)" = `non-CTV-05 L. crispatus`,
"Lactobacillus crispatus (?)" = `Undetermined L. crispatus`
) |>
inner_join(
props[, -j] |> as.data.frame() |> rownames_to_column("Barcode"),
by = join_by(Barcode)
) |>
as.data.frame() |>
column_to_rownames("Barcode") |>
as.matrix()
props_with_ctv05 <-
props_with_ctv05 / rowSums(props_with_ctv05)pure_lacto_topics <-
c(
"Lactobacillus crispatus (CTV05)",
"Lactobacillus crispatus (non-CTV05)",
"Lactobacillus crispatus (?)",
"Lactobacillus iners",
"Lactobacillus jensenii"
)
Lacto_topics_gamma <-
cbind(
props_with_ctv05[, pure_lacto_topics],
rowSums(props_with_ctv05[, other_Lacto_species])
) |>
set_colnames(c("I.a", "I.b", "I.c","III","V","VI"))The distribution of these topics in samples is as follow:
Lacto_topics_gamma |>
as.data.frame() |>
rownames_to_column("sample") |>
pivot_longer(-sample, names_to = "topic", values_to = "prop") |>
ggplot(aes(x = prop)) +
geom_histogram(bins = 50) +
facet_grid(topic ~ ., labeller = label_both) +
theme(strip.text.y = element_text(angle = 0))combined_topics_K4 <-
combine_topics(4, labelled_lda_models, Lacto_topics_gamma, Lacto_topics_beta)
topic_SE <-
make_topic_SE(
assay_name = str_c("c_topics_16S_", nrow(combined_topics_K4$beta), "_ctv05"),
lda_model = combined_topics_K4
)
mae <- c(mae, topic_SE)plot_topic_betas(mae, "c_topics_16S_10_ctv05")In this section, we filter and transform cytokine data.
cytok <-
get_assay_long_format(
mae, "cytokine", add_colData = TRUE,
feature_name = "cytokine", values_name = "concentration"
)g_raw_cytok <-
cytok |>
ggplot(aes(x = concentration, fill = cytokine)) +
geom_histogram(bins = 50) +
facet_wrap(cytokine ~ .) +
guides(fill = "none") +
ggtitle('Distribution of "raw" cytokine concentrations',
subtitle = "with imputed ULOQ and LLOQ values")
g_raw_cytok g_raw_cytok +
scale_x_log10() From these first plots, it is clear that the data needs some transformation, potentially a log transformation if that also helps stabilizing the variance. We also see that for many samples, concentrations were either below or above the limits of quantification (LLOQ and ULOQ) and were imputed.
We also look at the distribution per sample:
# cytok |>
# group_by(Barcode) |>
# mutate(median_log_concentration = median(log(concentration))) |>
# ungroup() |>
# arrange(median_log_concentration) |> mutate(Barcode = Barcode |> fct_inorder()) |>
# ggplot(aes(x = Barcode, y = concentration)) +
# # geom_boxplot(outlier.shape = NA, linewidth = 0.2) +
# geom_point(aes(col = cytokine), size = 0.5, alpha = 0.6) +
# geom_point(stat = "summary", fun = median, col = "black") +
# scale_y_log10() +
# theme(axis.text.x = element_blank())
# cytok |>
# group_by(Barcode) |>
# mutate(median_log_concentration = median(log(concentration))) |>
# ungroup() |>
# arrange(median_log_concentration) |> mutate(Barcode = Barcode |> fct_inorder()) |>
# ggplot(aes(x = Barcode, y = concentration)) +
# # geom_boxplot(outlier.shape = NA, linewidth = 0.2) +
# geom_point(aes(col = cytokine), size = 0.5, alpha = 0.6) +
# scale_y_log10() +
# facet_grid(cytokine ~ .) +
# theme(axis.text.x = element_blank())
cytok |>
group_by(Barcode) |>
mutate(median_log10_concentration_per_sample = median(log10(concentration))) |>
ungroup() |>
arrange(median_log10_concentration_per_sample) |>
mutate(Barcode = Barcode |> fct_inorder()) |>
group_by(cytokine) |>
mutate(median_log10_concentration_per_cytokine = median(log10(concentration))) |>
ungroup() |>
arrange(median_log10_concentration_per_cytokine) |>
mutate(cytokine = cytokine |> fct_inorder()) |>
ggplot(aes(x = Barcode, y = cytokine, fill = concentration |> log10())) +
geom_tile() +
theme(axis.text.x = element_blank()) +
scale_fill_gradient(low = "seashell", high = "midnightblue")We see that there might be a strong “size effect” in the data as samples with high levels of one cytokines appear to also have high levels of other cytokines.
We check if a \(\log_{10}\) transformation stabilizes the variance of the data.
cytok <-
cytok |> mutate(log10_concentration = log10(concentration)) |>
select(Barcode, cytokine, concentration, log10_concentration, everything())
cytok_summary <-
cytok |>
group_by(cytokine) |>
summarize(
`mean concentration` = mean(concentration, na.rm = TRUE),
`var concentration` = var(concentration, na.rm = TRUE),
`mean log10(concentration)` = mean(log10_concentration, na.rm = TRUE),
`var log10(concentration)` = var(log10_concentration, na.rm = TRUE),
.groups = "drop"
)
cytok_summary |>
ggplot(aes(x = `mean concentration`, y = `var concentration`)) +
geom_point() +
geom_smooth(method = "lm", formula = "y ~ x") +
scale_x_log10() + scale_y_log10() +
ggtitle("Mean-Variance on raw concentration") +
cytok_summary |>
ggplot(aes(x = `mean log10(concentration)`, y = `var log10(concentration)`)) +
geom_point() +
geom_smooth(method = "lm", formula = "y ~ x") +
ggtitle("Mean-Variance on log10-transformed concentration") +
labs(caption = "Each dot is a cytokine")The log10-transformation seems to stabilize the variance, although there might still be residual heteroskedasticity. We will use the log10-transformed data for downstream analyses.
We add an assay with the log10-transformed data to our multi-assay experiment object:
cytokines <- MultiAssayExperiment::assay(mae, "cytokine")
transformed_cytokines <- cytokines |> log10()
transformed_cytokines_assay <- list()
transformed_cytokines_assay[["cytokine_log10"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = transformed_cytokines
)
mae <- c(mae, transformed_cytokines_assay)As seen on one of the plots above, there might be a strong size effect in the data. This size effect might be a biological size effect (co-regulation of all cytokines) or an artifact related to the amount of material that was on each swab.
First, let’s characterize that effect by computing the correlations between cytokine transformed concentrations.
cytok_corr <- cor(transformed_cytokines |> t())
res <- Hmisc::rcorr(transformed_cytokines |> t())$P |> p.adjust("BH") |> range(na.rm = TRUE)
corrplot::corrplot(cytok_corr, method = "color", diag = FALSE, addCoef.col = "black", tl.col = "black", tl.srt = 45)ggsave(g, filename = str_c(fig_out_dir(), "S4B.pdf"), width = 9, height = 9, device = cairo_pdf)Correlations are all highly positive (and statistically different from 0).
To assess whether this size effect might be a technical or biological effect, we examine the association between the per-sample-mean of the scaled log10 concentrations and a series of variables that may drive that biological effect. The variables we check for are:
cytok <-
get_assay_long_format(
mae, "cytokine_log10", feature_name = "cytokine", values_name = "log10_concentration"
) |>
left_join(
get_assay_wide_format(mae, "prop_Lacto", add_colData = FALSE) |> dplyr::rename(MB_props = assay),
by = join_by(Barcode)
) |>
mutate(Visit = AVISITN |> get_visit_labels())
cytok_summary <-
cytok |>
group_by(cytokine) |> mutate(scaled_log10_concentration = scale(log10_concentration)) |> ungroup() |>
mutate(prop_Lacto = MB_props$prop_Lacto, prop_non_iners_Lacto = MB_props$prop_non_iners_Lacto) |>
group_by(Barcode, USUBJID, Visit, AGE, RACEGR2, BC, prop_Lacto, prop_non_iners_Lacto) |>
summarize(`mean z-log10(conc.)` = mean(scaled_log10_concentration), .groups = "drop")cytok_summary |>
ggplot(aes(x = Visit, y = `mean z-log10(conc.)`)) +
geom_path(aes(group = USUBJID), col = "gray70", alpha = 0.3) +
geom_boxplot(alpha = 0.5) +
geom_jitter(height = 0, width = 0.2) lm(`mean z-log10(conc.)` ~ Visit, data = cytok_summary) |> summary()
Call:
lm(formula = `mean z-log10(conc.)` ~ Visit, data = cytok_summary)
Residuals:
Min 1Q Median 3Q Max
-1.32782 -0.52355 -0.09988 0.39511 2.52702
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.09340 0.04792 1.949 0.0515 .
VisitPost-MTZ -0.31281 0.06714 -4.659 3.55e-06 ***
VisitWeek 4 -0.06772 0.06986 -0.969 0.3326
VisitWeek 8 -0.02361 0.07037 -0.336 0.7373
VisitWeek 12 -0.07429 0.07006 -1.060 0.2892
VisitWeek 16 0.32725 0.19714 1.660 0.0972 .
VisitWeek 20 0.24407 0.24844 0.982 0.3261
VisitWeek 24 -0.09972 0.07220 -1.381 0.1675
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6895 on 1143 degrees of freedom
Multiple R-squared: 0.02936, Adjusted R-squared: 0.02341
F-statistic: 4.938 on 7 and 1143 DF, p-value: 1.628e-05
Concentrations are a significantly lower at the post-MTZ visit, which is consistent with both a biological effect or a technical artifact due to amount on swab.
cytok_summary <-
cytok_summary |>
group_by(USUBJID) |> mutate(median_mean = median(`mean z-log10(conc.)`)) |> ungroup() |>
arrange(median_mean) |> mutate(USUBJID = USUBJID |> fct_inorder())
cytok_summary |>
ggplot(aes(x = USUBJID, y = `mean z-log10(conc.)`)) +
geom_boxplot(fill = "aquamarine3") # + geom_point(aes(color = Visit))lm(`mean z-log10(conc.)` ~ USUBJID, data = cytok_summary) |> anova()Analysis of Variance Table
Response: mean z-log10(conc.)
Df Sum Sq Mean Sq F value Pr(>F)
USUBJID 211 273.62 1.29680 4.255 < 2.2e-16 ***
Residuals 939 286.18 0.30477
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It also looks like there is a strong participant-effect.
Let’s explore this in more details for 10 random participants:
selected_USUBJID <- levels(cytok_summary$USUBJID)
selected_USUBJID <- selected_USUBJID[seq(1, length(selected_USUBJID), len = 10)|> round() ]
cytok |>
group_by(cytokine) |>
mutate(z_log10_concentration = scale(log10_concentration)) |>
ungroup() |>
filter(USUBJID %in% selected_USUBJID) |>
ggplot(aes(x = cytokine, y = Visit |> fct_rev(), fill = z_log10_concentration)) +
geom_tile() +
facet_grid(USUBJID ~ ., scales = "free", space = "free") +
scale_fill_gradient2() +
ylab("Visit")cytok_summary |>
ggplot(aes(x = AGE, y = `mean z-log10(conc.)`, col = USUBJID, group = USUBJID)) +
geom_path(position = position_dodge(width = 0.4)) +
geom_point(position = position_dodge(width = 0.4), alpha = 0.7) +
guides(col = "none")# lm_model <- lm(`mean z-log10(conc.)` ~ AGE + USUBJID, data = cytok_summary)
# summary(lm_model)
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ AGE + (1|USUBJID), data = cytok_summary)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ AGE + (1 | USUBJID)
Data: cytok_summary
REML criterion at convergence: 2203.4
Scaled residuals:
Min 1Q Median 3Q Max
-2.4454 -0.6196 -0.1534 0.5402 3.7245
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 0.1695 0.4117
Residual 0.3059 0.5530
Number of obs: 1151, groups: USUBJID, 212
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.502676 0.149976 3.352
AGE -0.016438 0.004741 -3.467
Correlation of Fixed Effects:
(Intr)
AGE -0.976
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: mean z-log10(conc.)
Chisq Df Pr(>Chisq)
AGE 12.019 1 0.0005266 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a statistically significant, but very mild, decrease with age, which might also be consistent with a biological effect or a technical artifact?
cytok_summary |>
ggplot(aes(x = RACEGR2, y = `mean z-log10(conc.)`)) +
geom_boxplot(outlier.shape = NA) +
geom_path(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4)) +
geom_point(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4), alpha = 0.7) +
guides(col = "none")# lm_model <- lm(`mean z-log10(conc.)` ~ RACEGR2 + USUBJID, data = cytok_summary)
# summary(lm_model)
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ RACEGR2 + (1|USUBJID), data = cytok_summary)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ RACEGR2 + (1 | USUBJID)
Data: cytok_summary
REML criterion at convergence: 2210.8
Scaled residuals:
Min 1Q Median 3Q Max
-2.4081 -0.6215 -0.1540 0.5512 3.7666
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 0.1830 0.4278
Residual 0.3055 0.5527
Number of obs: 1151, groups: USUBJID, 212
Fixed effects:
Estimate Std. Error t value
(Intercept) -0.04999 0.05593 -0.894
RACEGR2White 0.10408 0.07815 1.332
RACEGR2Other 0.02159 0.08850 0.244
Correlation of Fixed Effects:
(Intr) RACEGR2W
RACEGR2Whit -0.716
RACEGR2Othr -0.632 0.452
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: mean z-log10(conc.)
Chisq Df Pr(>Chisq)
RACEGR2 1.9349 2 0.3801
No significant differences here.
cytok_summary |>
filter(!is.na(BC)) |>
ggplot(aes(x = BC, y = `mean z-log10(conc.)`)) +
geom_boxplot(outlier.shape = NA) +
geom_path(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4)) +
geom_point(aes(col = USUBJID, group = USUBJID), position = position_dodge(width = 0.4), alpha = 0.7) +
guides(col = "none")# lm_model <- lm(`mean z-log10(conc.)` ~ BC + USUBJID, data = cytok_summary)
# summary(lm_model)
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ BC + (1|USUBJID), data = cytok_summary)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ BC + (1 | USUBJID)
Data: cytok_summary
REML criterion at convergence: 2201.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.2238 -0.6172 -0.1421 0.5450 3.7989
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 0.1719 0.4146
Residual 0.3054 0.5526
Number of obs: 1150, groups: USUBJID, 211
Fixed effects:
Estimate Std. Error t value
(Intercept) -0.00498 0.08537 -0.058
BCP only -0.03329 0.15367 -0.217
BCIUD (Non-hormonal) 0.30147 0.13446 2.242
BCIUD (Hormonal) 0.10550 0.12110 0.871
BCNon-hormonal -0.07735 0.09178 -0.843
BCunknown -0.11842 0.16258 -0.728
Correlation of Fixed Effects:
(Intr) BCPonl BCIUD(N BCIUD(H BCNn-h
BCP only -0.536
BCIUD(Nn-h) -0.635 0.340
BCIUD(Hrmn) -0.704 0.377 0.447
BCNon-hrmnl -0.877 0.470 0.557 0.620
BCunknown -0.492 0.264 0.313 0.353 0.446
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: mean z-log10(conc.)
Chisq Df Pr(>Chisq)
BC 13.733 5 0.0174 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a significant effect of the birth control method. Again, that may indicate a biological effect (copper IUD may increase overall inflammation) or a technical artifact (some birth controls may alter the mucus and how much material may be typically collected on swab).
cytok_summary |>
ggplot(aes(x = prop_Lacto, y = `mean z-log10(conc.)`)) +
geom_point() +
geom_smooth(se = FALSE) +
cytok_summary |>
ggplot(aes(x = prop_non_iners_Lacto, y = `mean z-log10(conc.)`)) +
geom_point() +
geom_smooth(se = FALSE)`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 4 rows containing non-finite outside the scale range (`stat_smooth()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ prop_Lacto + (1|USUBJID), data = cytok_summary)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ prop_Lacto + (1 | USUBJID)
Data: cytok_summary
REML criterion at convergence: 2198.3
Scaled residuals:
Min 1Q Median 3Q Max
-2.3427 -0.6161 -0.1482 0.5346 3.8015
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 0.1802 0.4245
Residual 0.3050 0.5523
Number of obs: 1147, groups: USUBJID, 211
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.05137 0.04068 1.263
prop_Lacto -0.11173 0.04544 -2.459
Correlation of Fixed Effects:
(Intr)
prop_Lacto -0.557
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: mean z-log10(conc.)
Chisq Df Pr(>Chisq)
prop_Lacto 6.0452 1 0.01394 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lme_model <- lme4::lmer(`mean z-log10(conc.)` ~ prop_non_iners_Lacto + (1|USUBJID), data = cytok_summary)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: `mean z-log10(conc.)` ~ prop_non_iners_Lacto + (1 | USUBJID)
Data: cytok_summary
REML criterion at convergence: 2202.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.3529 -0.6256 -0.1529 0.5509 3.7684
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 0.1805 0.4248
Residual 0.3064 0.5535
Number of obs: 1147, groups: USUBJID, 211
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.01094 0.03616 0.303
prop_non_iners_Lacto -0.06620 0.05531 -1.197
Correlation of Fixed Effects:
(Intr)
prp_nn_nr_L -0.353
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: mean z-log10(conc.)
Chisq Df Pr(>Chisq)
prop_non_iners_Lacto 1.4325 1 0.2314
Here, we see a mild but statistically significant association with the proportion of Lactobacillus in the sample. The effect is very small and accounts for very little of the observed variance.
Based on these plots, we conclude that technical variability such as variability in the total amount of material collected on swabs might be as or more important than biological factors in driving the size effect observed in the cytokine concentrations.
Consequently, we adjust for this size effect, thereby attempting to correct for the total amount of material collected on swab. We do this adjustment by removing the first principal component of the scaled log concentrations.
The PCA of the scaled log(concentrations) also highlight that strong size effect.
cytokines <-
get_assay_wide_format(mae, "cytokine_log10") |>
left_join(
get_assay_wide_format(mae, "CST", add_colData = FALSE) |> unnest(assay),
by = "Barcode"
) |>
left_join(
get_assay_wide_format(mae, "prop_Lacto", add_colData = FALSE) |> unnest(assay),
by = "Barcode"
)
cytokine_pca <- prcomp(cytokines$assay |> set_rownames(cytokines$Barcode), scale = TRUE)Warning: Setting row names on a tibble is deprecated.
cytokine_pca_var <-
cytokine_pca |>
broom::tidy(matrix = "eigenvalues") plot_pca_var(cytokine_pca_var)cytokine_pca_var|>
dplyr::rename(`% var` = percent, `cumulative % var` = cumulative) |>
gt() |>
fmt_percent(columns = c(`% var`, `cumulative % var`)) |>
fmt_number(columns = std.dev, decimals = 2)| PC | std.dev | % var | cumulative % var |
|---|---|---|---|
| 1 | 3.00 | 49.85% | 49.85% |
| 2 | 1.20 | 8.06% | 57.90% |
| 3 | 1.11 | 6.83% | 64.74% |
| 4 | 1.04 | 5.98% | 70.72% |
| 5 | 0.92 | 4.68% | 75.41% |
| 6 | 0.83 | 3.83% | 79.23% |
| 7 | 0.75 | 3.16% | 82.39% |
| 8 | 0.70 | 2.71% | 85.10% |
| 9 | 0.69 | 2.61% | 87.71% |
| 10 | 0.62 | 2.14% | 89.84% |
| 11 | 0.58 | 1.88% | 91.72% |
| 12 | 0.55 | 1.70% | 93.42% |
| 13 | 0.53 | 1.57% | 95.00% |
| 14 | 0.50 | 1.40% | 96.40% |
| 15 | 0.47 | 1.21% | 97.61% |
| 16 | 0.45 | 1.15% | 98.76% |
| 17 | 0.37 | 0.77% | 99.52% |
| 18 | 0.29 | 0.47% | 100.00% |
plot_pca_correlation_circle(cytokines$assay, cytokine_pca, cytokine_pca_var)Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
The first principal component capture the size effect and we see that it is very correlated with the “mean scaled log10(concentration)” we computed above:
cytok_summary |>
left_join(cytokine_pca$x |> as.data.frame() |> rownames_to_column("Barcode"), by = join_by(Barcode)) |>
ggplot(aes(x = PC1, y = `mean z-log10(conc.)`)) +
geom_point()Such that the relationships we highlighted above can also be seen in the PCA space:
Sample_PCs <- cytokine_pca |> broom::augment(cytokines |> select(-assay))
plot_pca_samples(Sample_PCs, pca_var = cytokine_pca_var, color_by = "prop_Lacto") +
scale_color_gradient2(
"Prop Lactobacillus",
low = "red", high = "dodgerblue3", mid = "gray80", midpoint = 0.5) +
theme(legend.position = "top") #+ coord_fixed()Sample_PCs |>
ggplot(aes(x = prop_Lacto, y = .fittedPC1)) +
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "lm") `geom_smooth()` using formula = 'y ~ x'
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
lme_model <- lme4::lmer(.fittedPC1 ~ prop_Lacto + (1|USUBJID), data = Sample_PCs)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: .fittedPC1 ~ prop_Lacto + (1 | USUBJID)
Data: Sample_PCs
REML criterion at convergence: 5523.6
Scaled residuals:
Min 1Q Median 3Q Max
-2.3360 -0.6077 -0.1474 0.5510 3.6575
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 3.385 1.840
Residual 5.542 2.354
Number of obs: 1147, groups: USUBJID, 211
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.2342 0.1750 1.338
prop_Lacto -0.5090 0.1940 -2.624
Correlation of Fixed Effects:
(Intr)
prop_Lacto -0.552
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: .fittedPC1
Chisq Df Pr(>Chisq)
prop_Lacto 6.8844 1 0.008695 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lme_model <- lme4::lmer(.fittedPC2 ~ prop_Lacto + (1|USUBJID), data = Sample_PCs)
summary(lme_model)Linear mixed model fit by REML ['lmerMod']
Formula: .fittedPC2 ~ prop_Lacto + (1 | USUBJID)
Data: Sample_PCs
REML criterion at convergence: 3148.7
Scaled residuals:
Min 1Q Median 3Q Max
-4.1765 -0.5471 0.0158 0.5903 3.1020
Random effects:
Groups Name Variance Std.Dev.
USUBJID (Intercept) 0.2900 0.5385
Residual 0.7376 0.8588
Number of obs: 1147, groups: USUBJID, 211
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.85752 0.05708 15.02
prop_Lacto -1.68852 0.06929 -24.37
Correlation of Fixed Effects:
(Intr)
prop_Lacto -0.606
car::Anova(lme_model, type = "II")Analysis of Deviance Table (Type II Wald chisquare tests)
Response: .fittedPC2
Chisq Df Pr(>Chisq)
prop_Lacto 593.86 1 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There see a very mild (but statistically significant) relationship between the first PC and the proportion of Lactobacillus, but a much stronger one with the 2nd PC.
plot_pca_samples(Sample_PCs, pca_var = cytokine_pca_var, color_by = "CST") +
scale_color_manual(
values = get_topic_colors(Sample_PCs$CST |> unique() |> sort())
)plot_pca_samples(Sample_PCs, pca_var = cytokine_pca_var, color_by = "BC") +
stat_ellipse()Too few points to calculate an ellipse
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_path()`).
plot_pca_samples(
Sample_PCs |> filter(AVISITN %in% c(0, 7)),
pca_var = cytokine_pca_var, color_by = "BC") +
stat_ellipse() +
facet_grid(AVISITN ~ ARM) Too few points to calculate an ellipse
Too few points to calculate an ellipse
Too few points to calculate an ellipse
Too few points to calculate an ellipse
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_path()`).
plot_pca_samples(
Sample_PCs |>
filter(PIPV) |>
mutate(Visit = str_c("Visit: ", AVISITN) |> factor()),
pca_var = cytokine_pca_var, color_by = "Visit") +
facet_grid(ARM ~ Visit) +
guides(col = "none")To remove that size effect, we remove the first PC from the scaled log2-transformed concentrations. This is almost equivalent to regressing out the mean scaled log2(concentration) from the data.
To remove the PC1 from the data, we set the first PC to 0 then multiply by the inverse of the rotation matrix.
Indeed, we have:
\(E = XR\) where \(E\) are the coordinates in the PC space, \(X\) are the coordinates in the original space (the scaled log2-transformed concentrations), and \(R\) is the rotation matrix.
X <- cytokines$assay |> as.matrix() |> scale()
(cytokine_pca$x == X %*% cytokine_pca$rotation) |> all()[1] TRUE
So, we have \(X = E R^{-1}\):
X_hat <- cytokine_pca$x %*% solve(cytokine_pca$rotation)
all(round(X_hat,6) == round(X, 6))[1] TRUE
Consequently, setting \(E[,1]\) to 0 (i.e., we project the first PC on its axis) and multiplying by the inverse rotation is equivalent to removing the first PC from the original data.
PC1_removed <-
cytokine_pca$x |> as.data.frame() |> mutate(PC1 = 0) |> as.matrix()
PC1_removed <- PC1_removed %*% solve(cytokine_pca$rotation)PC1_removed_long <-
PC1_removed |>
as_tibble() |>
mutate(Barcode = cytokines$Barcode) |>
pivot_longer(
cols = -Barcode,
names_to = "cytokine",
values_to = "PC1_removed_log10_conc"
) |>
left_join(cytokines |> select(-assay), by = join_by(Barcode))Below, we show the relationships between the scaled log2(concentrations) and the PC1-removed log2(concentrations).
log10_conc <-
PC1_removed_long |>
left_join(
get_assay_long_format(
mae, "cytokine_log10", add_colData = FALSE,
feature_name = "cytokine", values_name = "log10_concentration"
) |>
group_by(cytokine) |> mutate(scaled_log10_concentration = scale(log10_concentration)) |> ungroup(),
by = join_by(Barcode, cytokine)
)
ggplot(log10_conc, aes(x = scaled_log10_concentration, y = PC1_removed_log10_conc)) +
geom_point(size = 0.5, alpha = 0.25) +
facet_wrap(cytokine ~ .) +
xlab("scaled log10(concentration)") + ylab("PC1-removed scaled log10(concentration)")And the correlations between cytokines after PC1 removal
dim(PC1_removed)
# rownames(PC1_removed)
# colnames(PC1_removed)n_cytok <- ncol(PC1_removed)
cytok_names <- colnames(PC1_removed)
PC1_subtracted_cytok_corr <- cor(PC1_removed)
PC1_subtracted_cytok_corr[PC1_subtracted_cytok_corr == 1] <- NA
res <- Hmisc::rcorr(PC1_removed)$P |> p.adjust("BH")
res <- res |> matrix(nrow = n_cytok, ncol = n_cytok) |> set_colnames(cytok_names) |> set_rownames(cytok_names)
corrplot::corrplot(PC1_subtracted_cytok_corr, method = "color", addCoef.col = "black", tl.col = "black", tl.srt = 45, p.mat = res, sig.level = 0.05)# cor(PC1_removed) |>
# as_tibble() |>
# mutate(cytokine_1 = colnames(PC1_removed)) |>
# pivot_longer(cols = -cytokine_1, names_to = "cytokine_2", values_to = "cor") |>
# filter(cor != 1) |>
# ggplot(aes(x = cytokine_1, y = cytokine_2, fill = cor)) +
# geom_tile() +
# scale_fill_gradient2(
# low = "coral", mid = "white", high = "turquoise4", midpoint = 0,
# breaks = seq(-1, 1, by = 0.25), limits = c(-1,1)
# ) +
# xlab("") + ylab("") +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
# coord_fixed()We see that now, some cytokines are correlated and some are anti-correlated. Of particular interest, we see now that IP-10 and IL-1b are anti-correlated, which is consistent with past literature.
In addition, we check whether see PC1-removed log2(concentrations) follow the expected (based on previous publications) relationships with proportion of Lactobacillus (and CSTs).
ggplot(PC1_removed_long, aes(x = prop_Lacto, y = PC1_removed_log10_conc)) +
geom_point(size = 0.35, alpha = 0.5) +
geom_smooth(method = "lm", formula = "y ~ x") +
facet_grid(. ~ cytokine) +
scale_x_continuous(breaks = seq(0, 1, 0.5)) +
ylab("PC1-removed scaled log10(concentration)")Warning: Removed 72 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 72 rows containing missing values or values outside the scale range
(`geom_point()`).
ggplot(PC1_removed_long |> filter(!is.na(CST), CST != "II"),
aes(x = CST, y = PC1_removed_log10_conc, fill = CST)) +
geom_boxplot(outlier.size = 0.5) +
facet_grid(. ~ cytokine) +
scale_fill_manual(
breaks = PC1_removed_long$CST |> unique() |> sort(),
values = get_topic_colors(PC1_removed_long$CST |> unique() |> sort())
) +
guides(fill = "none") +
ylab("PC1-removed scaled log10(concentration)")When considering original scaled log10-concentrations, we have:
ggplot(log10_conc, aes(x = prop_Lacto, y = scaled_log10_concentration)) +
geom_point(size = 0.35, alpha = 0.5) +
geom_smooth(method = "lm", formula = "y ~ x") +
facet_grid(. ~ cytokine) +
scale_x_continuous(breaks = seq(0, 1, 0.5)) +
ylab("Scaled log10(concentration)")Warning: Removed 72 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 72 rows containing missing values or values outside the scale range
(`geom_point()`).
ggplot(log10_conc |> filter(!is.na(CST), CST != "II"),
aes(x = CST, y = scaled_log10_concentration, fill = CST)) +
geom_boxplot(outlier.size = 0.5) +
facet_grid(. ~ cytokine) +
scale_fill_manual(
breaks = PC1_removed_long$CST |> unique() |> sort(),
values = get_topic_colors(PC1_removed_long$CST |> unique() |> sort())
) +
guides(fill = "none") +
ylab("Scaled log10(concentration)")The table below provide the correlations between the cytokine log10-concentrations and the proportion of Lactobacillus before and after PC1 removal.
tmp <-
cbind(
cor(cytokines$assay, cytokines$prop_Lacto, use = "complete", method = "spearman"),
cor(PC1_removed, cytokines$prop_Lacto, use = "complete", method = "spearman")
) |>
set_colnames(c("log10(conc.)", "PC1 removed z-log10(conc.)"))
tmp |>
as.data.frame() |>
rownames_to_column("cytokine") |>
pivot_longer(
cols = -cytokine,
names_to = "type",
values_to = "correlation"
) |>
mutate(type = type |> fct_inorder()) |>
ggplot() +
aes(y = cytokine, x = type, fill = correlation) +
geom_tile() +
geom_text(aes(label = round(correlation, 2)), size = 3) +
scale_fill_gradient2() +
xlab("") +
ggtitle("Spearman correlations between cytokine concentrations and\nproportion of Lactobacillus for each cytokine")# tmp |>
# knitr::kable(caption = "Spearman correlations between cytokine concentrations and proportion of Lactobacillus for each cytokine")These relationships appear similar to those that have been observed in past studies (and, of particular interest, studies relying on CVL samples which are probably less sensitive in terms of total material contained in samples).
mae objectSince we believe that PC1 subtraction may in removing technical artifact due to the total amount on swab, we save these transformed concentrations to the mae object.
transformed_cytokines_assay <- list()
transformed_cytokines_assay[["cytokine_log10_SE_corrected"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = PC1_removed |> t() |> set_colnames(cytokines$Barcode)
)
mae <- c(mae, transformed_cytokines_assay)One important thing to note is that removing the size effect will lead the imputed LLOQ and ULOQ values (which were all imputed to the same values) to have different values based on the estimated effect size of each samples. For example, a LLOQ value in a sample with a high PC1 (~ a high total amount of material) will have a lower value after PC1-removal than a LLOQ value from a sample with a small PC1 (assumed to have less material on swab). So, the effect on LLOQ and ULOQ values depends on how well the size effect could be estimated, which, in turn, depends on the number of out-of-range values in the sample and our trust in cytokines’ transformed concentrations also depends on how many cytokines had observable data.
We should thus limit analyses relying on PC1-subtracted values to cytokines with sufficient data within the limits of quantification (and to samples with sufficient number of within-range cytokines).
Several cytokines have concentrations that were outside the limits of quantification (LLOQ and ULOQ) for a large fraction of samples. We remove these cytokines from the analysis.
cytok <-
get_assay_long_format(mae, "cytokine_log10", feature_name = "cytokine", values_name = "log10_concentration") |>
left_join(
get_assay_long_format(mae, "cytokine_log10_SE_corrected", feature_name = "cytokine", values_name = "transformed_concentration", add_colData = FALSE),
by = join_by(Barcode, cytokine)
) |>
group_by(cytokine) |>
mutate(
value_type =
case_when(
log10_concentration == min(log10_concentration) ~ "LLOQ",
log10_concentration == max(log10_concentration) ~ "ULOQ",
TRUE ~ "within_range"
)
) |>
ungroup() |>
select(Barcode, cytokine, value_type, log10_concentration, transformed_concentration, everything())
fraction_at_LLOQ <-
cytok |>
group_by(cytokine) |>
summarize(
n_LLOQ = sum(log10_concentration == min(log10_concentration)),
n_ULOQ = sum(log10_concentration == max(log10_concentration)),
n_tot = n()
) |>
mutate(
`fraction below LLOQ` = n_LLOQ/n_tot,
`fraction above ULOQ` = n_ULOQ/n_tot
) |>
arrange(-`fraction below LLOQ`)
fraction_at_LLOQ |> gt()| cytokine | n_LLOQ | n_ULOQ | n_tot | fraction below LLOQ | fraction above ULOQ |
|---|---|---|---|---|---|
| IL-10 | 862 | 1 | 1151 | 0.7489139878 | 0.0008688097 |
| IL-4 | 838 | 1 | 1151 | 0.7280625543 | 0.0008688097 |
| IL-13 | 721 | 1 | 1151 | 0.6264118158 | 0.0008688097 |
| IL-21 | 719 | 1 | 1151 | 0.6246741964 | 0.0008688097 |
| TNFa | 662 | 1 | 1151 | 0.5751520417 | 0.0008688097 |
| IL-23 | 590 | 1 | 1151 | 0.5125977411 | 0.0008688097 |
| IL-17 | 570 | 1 | 1151 | 0.4952215465 | 0.0008688097 |
| MIP-1b | 555 | 1 | 1151 | 0.4821894005 | 0.0008688097 |
| ITAC | 527 | 1 | 1151 | 0.4578627281 | 0.0008688097 |
| IFNg | 476 | 1 | 1151 | 0.4135534318 | 0.0008688097 |
| IL-6 | 423 | 1 | 1151 | 0.3675065161 | 0.0008688097 |
| IL-1b | 313 | 4 | 1151 | 0.2719374457 | 0.0034752389 |
| MIP-3a | 213 | 1 | 1151 | 0.1850564726 | 0.0008688097 |
| MIP-1a | 171 | 1 | 1151 | 0.1485664639 | 0.0008688097 |
| MIG | 128 | 55 | 1151 | 0.1112076455 | 0.0477845352 |
| IP-10 | 62 | 25 | 1151 | 0.0538662033 | 0.0217202433 |
| IL-1a | 9 | 7 | 1151 | 0.0078192876 | 0.0060816681 |
| IL-8 | 1 | 399 | 1151 | 0.0008688097 | 0.3466550825 |
threshold_LLOQ <- 0.6
threshold_ULOQ <- 0.3fraction_at_LLOQ <-
fraction_at_LLOQ |>
mutate(
included =
(`fraction below LLOQ` < threshold_LLOQ) &
(`fraction above ULOQ` < threshold_ULOQ)
)
included_cytokines <- fraction_at_LLOQ$cytokine[fraction_at_LLOQ$included] |> as.character()
excluded_cytokines <- fraction_at_LLOQ$cytokine[!fraction_at_LLOQ$included] |> as.character()cytok <- cytok |> left_join(fraction_at_LLOQ, by = join_by(cytokine)) g <-
ggplot(cytok, aes(x = log10_concentration, fill = included)) +
geom_histogram(bins = 50) +
facet_wrap(cytokine ~ ., nrow = 3) +
theme(legend.position = "bottom") +
labs(
x = "log10(imputed concentrations)",
caption = str_c("Samples were excluded if more than ", threshold_LLOQ*100,"% of their values were below the LLOQ or more than ",threshold_ULOQ*100 ,"% above the ULOQ")
)
gggplot(cytok, aes(x = transformed_concentration)) +
geom_histogram(aes(fill = value_type), bins = 100) +
geom_text(data = fraction_at_LLOQ |> mutate(included = ifelse(included, "included","excluded")),
aes(label = included, col = included), x = Inf, y = Inf, hjust = 1, vjust = 1) +
facet_wrap(cytokine ~ ., nrow = 3) +
theme(legend.position = "bottom") ggsave(g, filename = str_c(fig_out_dir(), "S4A.pdf"), width = 10, height = 6, device = cairo_pdf)Specifically, we exclude cytokines with concentration below the LLOQ for over 60% of samples or above the ULOQ for over 30% of samples.
Based on these criteria, we exclude IL-10, IL-4, IL-13, IL-21, IL-8 from downstream analyses.
We add a new assay to the mae object with the filtered cytokines.
filtered_cytokines <- MultiAssayExperiment::assay(mae, "cytokine_log10_SE_corrected")
filtered_cytokines <- filtered_cytokines[included_cytokines,]
filtered_cytokines_assay <- list()
filtered_cytokines_assay[["cytokine_log10_SE_corrected_incl_cytokine"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = filtered_cytokines
)
mae <- c(mae, filtered_cytokines_assay)In case one is interested in also doing the analysis on the non-PC1-substracted cytokine data, we also add this assay to the mae object.
filtered_cytokines <- MultiAssayExperiment::assay(mae, "cytokine_log10")
filtered_cytokines <- filtered_cytokines[included_cytokines,]
filtered_cytokines_assay <- list()
filtered_cytokines_assay[["cytokine_log10_incl_cytokine"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = filtered_cytokines
)
mae <- c(mae, filtered_cytokines_assay)cytok <- cytok |> mutate(Visit = AVISITN |> get_visit_labels())
cytok_filtered <- cytok |> filter(cytokine %in% included_cytokines)
sample_summary <-
cytok |>
group_by(Barcode, Visit, USUBJID, PIPV, ARM) |>
summarize(
n_within_range = sum(value_type == "within_range"),
n_tot = n(),
.groups = "drop"
) |>
arrange(USUBJID, Visit)
sample_summary <-
sample_summary |>
group_by(USUBJID) |> mutate(mean_n_within_range = mean(n_within_range)) |> ungroup() |>
arrange(mean_n_within_range) |> mutate(USUBJID = USUBJID |> fct_inorder())
sample_summary |>
filter(PIPV) |>
ggplot(aes(x = Visit, y = USUBJID)) +
geom_tile(aes(fill = n_within_range)) +
scale_fill_gradient2(low = "red", mid = "gray", high = "steelblue1", midpoint = 5.5, breaks = seq(0, 13, by = 3)) +
facet_grid(ARM ~ ., scales = "free_y", space = "free") Again, we see a strong participant and visit effect on the number of out-of-range values.
sample_summary |>
filter(PIPV) |>
ggplot(aes(x = Visit, y = n_within_range, fill = ARM)) +
geom_boxplot() +
scale_fill_manual(values = get_arm_colors()) sample_summary |>
ggplot(aes(x = n_within_range, fill = ARM)) +
geom_vline(xintercept = 5.5, linetype = 2, col = "red") +
geom_histogram(binwidth = 0.5) +
facet_grid(ARM ~ ., scales = "free_y") +
scale_fill_manual(values = get_arm_colors()) We flag samples with strictly less than 5 cytokines within the limits of detection:
flagged_samples <- sample_summary |> filter(n_within_range < 5)
included_samples <- sample_summary |> filter(n_within_range >= 5)
flagged_samples |>
group_by(USUBJID, ARM) |>
summarize(n_flagged_samples = n(), .groups = "drop") |>
arrange(-n_flagged_samples) |>
gt()| USUBJID | ARM | n_flagged_samples |
|---|---|---|
| STI.00224 | LACTIN-V | 4 |
| STI.00435 | LACTIN-V | 2 |
| STI.00469 | LACTIN-V | 2 |
| STI.00219 | LACTIN-V | 2 |
| STI.00476 | LACTIN-V | 2 |
| STI.00266 | Placebo | 2 |
| STI.00307 | Placebo | 2 |
| STI.00616 | LACTIN-V | 1 |
| STI.00226 | LACTIN-V | 1 |
| STI.00200 | Placebo | 1 |
| STI.00987 | Placebo | 1 |
| STI.00627 | Placebo | 1 |
| STI.00268 | LACTIN-V | 1 |
| STI.01200 | LACTIN-V | 1 |
| STI.00285 | LACTIN-V | 1 |
| STI.00405 | LACTIN-V | 1 |
| STI.00461 | Placebo | 1 |
| STI.00320 | Placebo | 1 |
| STI.00401 | LACTIN-V | 1 |
| STI.00332 | Placebo | 1 |
| STI.00693 | LACTIN-V | 1 |
| STI.00233 | Placebo | 1 |
| STI.00452 | LACTIN-V | 1 |
| STI.00632 | Placebo | 1 |
| STI.00350 | Placebo | 1 |
| STI.00337 | LACTIN-V | 1 |
| STI.00406 | LACTIN-V | 1 |
| STI.01152 | LACTIN-V | 1 |
| STI.01089 | Placebo | 1 |
| STI.00568 | LACTIN-V | 1 |
| STI.01024 | LACTIN-V | 1 |
| STI.00462 | LACTIN-V | 1 |
| STI.00819 | Placebo | 1 |
| STI.00453 | LACTIN-V | 1 |
| STI.00758 | Placebo | 1 |
| STI.00683 | LACTIN-V | 1 |
| STI.00194 | LACTIN-V | 1 |
| STI.00964 | LACTIN-V | 1 |
| STI.00907 | LACTIN-V | 1 |
| STI.00458 | Placebo | 1 |
| STI.00394 | LACTIN-V | 1 |
| STI.00434 | Placebo | 1 |
| STI.00820 | LACTIN-V | 1 |
| STI.01179 | LACTIN-V | 1 |
flagged_samples |>
mutate(Arm = ARM |> str_replace("ACTIN", "actin")) |>
arrange(Visit, Arm) |>
group_by(Visit, Arm) |>
summarize(`n flagged samples` = n(), .groups = "drop") |>
pivot_wider(names_from = Arm, values_from = `n flagged samples`) |>
gt(
rowname_col = "Visit",
caption = "Number of flagged samples"
) |>
grand_summary_rows(
columns = -c("Visit"),
fns = list(label = "Total", id = "totals", fn = "sum"),
fmt = ~ fmt_integer(.),
side = "bottom"
) | Lactin-V | Placebo | |
|---|---|---|
| Pre-MTZ | 7 | 2 |
| Post-MTZ | 11 | 7 |
| Week 4 | 2 | 2 |
| Week 8 | 7 | 1 |
| Week 12 | 6 | 2 |
| Week 24 | 2 | 4 |
| Total | 35 | 18 |
transformed_cytokines <- MultiAssayExperiment::assay(mae, "cytokine_log10_SE_corrected_incl_cytokine")
transformed_cytokines <- transformed_cytokines[,included_samples$Barcode]
transformed_cytokines_assay <- list()
transformed_cytokines_assay[["cytokine_transformed"]] <-
SummarizedExperiment::SummarizedExperiment(
assay = transformed_cytokines
)
mae <- c(mae, transformed_cytokines_assay)In the cytokine_log10_incl_cytokine assay, we only add a column to the assay colData to flag these samples.
tmp <- mae[["cytokine_log10_incl_cytokine"]]@colData |> as.data.frame() |> rownames_to_column("Barcode") |> as_tibble()
tmp <-
tmp |>
select(-any_of(c("n_within_range", "flagged"))) |>
left_join(
sample_summary |> select(Barcode, n_within_range) |> mutate(flagged = n_within_range < 5),
by = join_by(Barcode)
)
mae[["cytokine_log10_incl_cytokine"]]$n_within_range <- tmp$n_within_range
mae[["cytokine_log10_incl_cytokine"]]$flagged <- tmp$flaggedlog10_conc <-
log10_conc |>
select(-any_of("sample")) |>
left_join(included_samples |> select(Barcode) |> mutate(sample = "included"), by = join_by(Barcode)) |>
select(Barcode, cytokine, PC1_removed_log10_conc, scaled_log10_concentration, sample) |>
mutate(sample = sample |> str_replace_na("flagged"))
ggplot(log10_conc) +
aes(x = scaled_log10_concentration, y = PC1_removed_log10_conc, col = sample) +
geom_point(size = 0.5, alpha = 0.25) +
facet_wrap(cytokine ~ .) +
xlab("scaled log2(concentration)") + ylab("PC1-removed scaled log2(concentration)")Finally, to ease the interpretation of downstream analyses, we re-order the cytokines such that correlated ones are grouped together.
PC1_removed <- get_assay_wide_format(mae, "cytokine_log10_SE_corrected")
PC1_removed <- PC1_removed$assay |> as.data.frame() |> set_rownames(PC1_removed$Barcode)cor_c <- cor(PC1_removed)
hclust_cor <- hclust((1 - cor_c) |> as.dist())
ordered_cytokines <- colnames(PC1_removed)[hclust_cor$order]corrplot::corrplot(cor_c[ordered_cytokines, ordered_cytokines], method = "color", tl.col = "black", tl.srt = 45)Modifying the existing mae assays:
ac <- mae[["cytokine"]]
j <- ordered_cytokines |> intersect(rownames(ac))
mae[["cytokine"]] <- ac[j,]
ac <- mae[["cytokine_log10"]]
j <- ordered_cytokines |> intersect(rownames(ac))
mae[["cytokine_log10"]] <- ac[j,]
ac <- mae[["cytokine_log10_SE_corrected"]]
j <- ordered_cytokines |> intersect(rownames(ac))
mae[["cytokine_log10_SE_corrected"]] <- ac[j,]
ac <- mae[["cytokine_log10_SE_corrected_incl_cytokine"]]
j <- ordered_cytokines |> intersect(rownames(ac))
mae[["cytokine_log10_SE_corrected_incl_cytokine"]] <- ac[j,]
ac <- mae[["cytokine_log10_incl_cytokine"]]
j <- ordered_cytokines |> intersect(rownames(ac))
mae[["cytokine_log10_incl_cytokine"]] <- ac[j,]
ac <- mae[["cytokine_transformed"]]
j <- ordered_cytokines |> intersect(rownames(ac))
mae[["cytokine_transformed"]] <- ac[j,]maeWe can now save this augmented mae object.
output_dir <- str_c(data_dir(), "03_augmented_mae/")
save_mae(mae, output_dir)MAE is identical to the last one. Not saving.
# If dropbox mounting was done manually, remember to unmount it.As a reminder, the “raw” mae object had the following assays:
raw_maeA MultiAssayExperiment object of 7 listed
experiments with user-defined names and respective classes.
Containing an ExperimentList class object of length 7:
[1] swabs: SummarizedExperiment with 2 rows and 1914 columns
[2] ASV_16S_all: SummarizedExperiment with 9369 rows and 1176 columns
[3] ASV_16S: SummarizedExperiment with 9369 rows and 1174 columns
[4] tax_16S: SummarizedExperiment with 220 rows and 1174 columns
[5] cytokine: SummarizedExperiment with 18 rows and 1151 columns
[6] MG_all_Lc_strains: SummarizedExperiment with 301 rows and 1110 columns
[7] MG_CTV05: SummarizedExperiment with 4 rows and 1110 columns
Functionality:
experiments() - obtain the ExperimentList instance
colData() - the primary/phenotype DataFrame
sampleMap() - the sample coordination DataFrame
`$`, `[`, `[[` - extract colData columns, subset, or experiment
*Format() - convert into a long or wide DataFrame
assays() - convert ExperimentList to a SimpleList of matrices
exportClass() - save data to flat files
Now, the mae object has these assays + additional ones:
maeA MultiAssayExperiment object of 24 listed
experiments with user-defined names and respective classes.
Containing an ExperimentList class object of length 24:
[1] swabs: SummarizedExperiment with 2 rows and 1914 columns
[2] ASV_16S_all: SummarizedExperiment with 9369 rows and 1176 columns
[3] ASV_16S: SummarizedExperiment with 9369 rows and 1174 columns
[4] tax_16S: SummarizedExperiment with 220 rows and 1174 columns
[5] cytokine: SummarizedExperiment with 18 rows and 1151 columns
[6] MG_all_Lc_strains: SummarizedExperiment with 301 rows and 1110 columns
[7] MG_CTV05: SummarizedExperiment with 4 rows and 1110 columns
[8] ASV_16S_filtered: SummarizedExperiment with 780 rows and 1174 columns
[9] ASV_16S_filtered_p: SummarizedExperiment with 780 rows and 1174 columns
[10] tax_16S_p: SummarizedExperiment with 220 rows and 1174 columns
[11] tax_CTV05_p: SummarizedExperiment with 222 rows and 1176 columns
[12] prop_Lacto: SummarizedExperiment with 6 rows and 1174 columns
[13] mb_categories: SummarizedExperiment with 1 rows and 1174 columns
[14] mb_categories_wide: SummarizedExperiment with 3 rows and 1174 columns
[15] CST: SummarizedExperiment with 3 rows and 1174 columns
[16] topics_16S_7: SummarizedExperiment with 7 rows and 1174 columns
[17] c_topics_16S_8: SummarizedExperiment with 8 rows and 1174 columns
[18] c_topics_16S_9: SummarizedExperiment with 9 rows and 1174 columns
[19] c_topics_16S_10_ctv05: SummarizedExperiment with 10 rows and 1108 columns
[20] cytokine_log10: SummarizedExperiment with 18 rows and 1151 columns
[21] cytokine_log10_SE_corrected: SummarizedExperiment with 18 rows and 1151 columns
[22] cytokine_log10_SE_corrected_incl_cytokine: SummarizedExperiment with 13 rows and 1151 columns
[23] cytokine_log10_incl_cytokine: SummarizedExperiment with 13 rows and 1151 columns
[24] cytokine_transformed: SummarizedExperiment with 13 rows and 1098 columns
Functionality:
experiments() - obtain the ExperimentList instance
colData() - the primary/phenotype DataFrame
sampleMap() - the sample coordination DataFrame
`$`, `[`, `[[` - extract colData columns, subset, or experiment
*Format() - convert into a long or wide DataFrame
assays() - convert ExperimentList to a SimpleList of matrices
exportClass() - save data to flat files
sessionInfo()R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Brussels
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] alto_0.1.0 ValenciaR_0.1.0 phyloseq_1.50.0 ggthemes_5.1.0
[5] gt_1.0.0 viridis_0.6.5 viridisLite_0.4.2 patchwork_1.3.0
[9] magrittr_2.0.3 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
[13] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5 tidyr_1.3.1
[17] tibble_3.2.1 ggplot2_3.5.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 rstudioapi_0.17.1
[3] jsonlite_2.0.0 MultiAssayExperiment_1.32.0
[5] modeltools_0.2-24 nloptr_2.2.1
[7] corrplot_0.95 farver_2.1.2
[9] rmarkdown_2.29 fs_1.6.6
[11] zlibbioc_1.52.0 vctrs_0.6.5
[13] multtest_2.62.0 minqa_1.2.8
[15] topicmodels_0.2-17 base64enc_0.1-3
[17] htmltools_0.5.8.1 S4Arrays_1.6.0
[19] BiocBaseUtils_1.8.0 curl_6.2.3
[21] broom_1.0.8 Rhdf5lib_1.28.0
[23] SparseArray_1.6.2 Formula_1.2-5
[25] rhdf5_2.50.2 sass_0.4.10
[27] YC_0.1.0 htmlwidgets_1.6.4
[29] plyr_1.8.9 igraph_2.1.4
[31] lifecycle_1.0.4 iterators_1.0.14
[33] pkgconfig_2.0.3 Matrix_1.7-3
[35] R6_2.6.1 fastmap_1.2.0
[37] GenomeInfoDbData_1.2.13 rbibutils_2.3
[39] MatrixGenerics_1.18.1 digest_0.6.37
[41] colorspace_2.1-1 S4Vectors_0.44.0
[43] Hmisc_5.2-3 GenomicRanges_1.58.0
[45] vegan_2.6-10 philentropy_0.9.0
[47] labeling_0.4.3 timechange_0.3.0
[49] httr_1.4.7 abind_1.4-8
[51] mgcv_1.9-1 compiler_4.4.2
[53] bit64_4.6.0-1 withr_3.0.2
[55] backports_1.5.0 htmlTable_2.4.3
[57] carData_3.0-5 MASS_7.3-65
[59] DelayedArray_0.32.0 biomformat_1.34.0
[61] permute_0.9-7 CVXR_1.0-15
[63] tools_4.4.2 foreign_0.8-90
[65] ape_5.8-1 nnet_7.3-19
[67] glue_1.8.0 nlme_3.1-168
[69] rhdf5filters_1.18.1 checkmate_2.3.2
[71] cluster_2.1.8.1 reshape2_1.4.4
[73] ade4_1.7-23 generics_0.1.4
[75] lpSolve_5.6.23 gtable_0.3.6
[77] tzdb_0.5.0 data.table_1.17.4
[79] hms_1.1.3 car_3.1-3
[81] xml2_1.3.8 XVector_0.46.0
[83] BiocGenerics_0.52.0 ggrepel_0.9.6
[85] foreach_1.5.2 pillar_1.10.2
[87] vroom_1.6.5 splines_4.4.2
[89] lattice_0.22-6 survival_3.8-3
[91] gmp_0.7-5 bit_4.6.0
[93] tidyselect_1.2.1 tm_0.7-16
[95] Biostrings_2.74.1 knitr_1.50
[97] reformulas_0.4.1 gridExtra_2.3
[99] NLP_0.3-2 IRanges_2.40.1
[101] SummarizedExperiment_1.36.0 stats4_4.4.2
[103] xfun_0.52 Biobase_2.66.0
[105] T4transport_0.1.3 matrixStats_1.5.0
[107] stringi_1.8.7 UCSC.utils_1.2.0
[109] boot_1.3-31 yaml_2.3.10
[111] evaluate_1.0.3 codetools_0.2-20
[113] cli_3.6.5 rpart_4.1.23
[115] Rdpack_2.6.4 Rcpp_1.0.14
[117] GenomeInfoDb_1.42.3 parallel_4.4.2
[119] lme4_1.1-37 Rmpfr_1.1-0
[121] slam_0.1-55 scales_1.4.0
[123] crayon_1.5.3 rlang_1.1.6